Here, I write code that tests whether forward feature selection makes sense in the context of unsupervised randomForest.
# Unsupervised variable reduction via the "real vs fake" trick: build a copy of
# x with every column independently permuted (marginals preserved, joint
# structure destroyed) and forward-select the variables that best separate the
# real rows from the permuted rows.
# NOTE(review): varReduct is project-local; forward-selection semantics assumed
# from its argument names — confirm against its definition.
usVr = function(x, seed=42) {
set.seed(seed)
# Shuffle each column independently so inter-feature dependence is broken.
x_fake = x %>% mutate(across(everything(), ~ sample(.x, size=nrow(x))))
varReduct(predictor_vars=rbind(x, x_fake),
response_var=factor(c(rep('real', nrow(x)), rep('fake', nrow(x_fake)))),
varReduct_method='forwardSelect',
algo='randomForest',
select_method='elbow_and_gap',
trace=F,
# num_jobs=1,
seed=42)
}usrf = function(x, seed=42, ntree=1000) {
# Body of usrf(x, seed, ntree): fit a single unsupervised random forest using
# the same real-vs-permuted-fake construction as usVr, returning the
# randomForest fit (importance kept, forest discarded to save memory).
set.seed(seed)
# Permute each column independently to create the 'fake' class.
x_fake = x %>% mutate(across(everything(), ~ sample(.x, size=nrow(x))))
rf = randomForest::randomForest(x=rbind(x, x_fake),
y=factor(c(rep('real', nrow(x)), rep('fake', nrow(x_fake)))),
importance=T,
keep.forest=F,
ntree=ntree
)
rf
}varimpcomp_plot = function(...) {
# Body of varimpcomp_plot(...): dot plot comparing variable-importance RANKS
# across several varReduct/randomForest runs passed via `...`.
dots = list(...)
# If the caller did not name the runs, fall back to the deparsed argument
# expressions as run labels.
if(is.null(names(dots))) {
names(dots) = unlist(lapply(substitute(list(...))[-1], deparse))
}
# Number of variables per run; used below to align ranks across runs.
ks = sapply(dots, function(x) nrow(importance(x)))
tmp = bind_rows(lapply(seq_along(dots), function(i) {
x = dots[[i]]
tmp = importance(x) %>%
tibble::rownames_to_column('variable') %>%
mutate(run=names(dots)[i]) %>%
mutate(vimp_rank=rank(-overall))
# Runs with more variables than the smallest run get their ranks shifted
# down by one — presumably to compensate for an extra variable (e.g. the
# class label) present only in those runs; TODO confirm intent.
if(ks[i] > min(ks)) {
tmp$vimp_rank = tmp$vimp_rank - 1
}
tmp
}))
# Order the y-axis by mean rank across runs (largest mean rank first).
tmp2 = tmp %>% group_by(variable) %>%
summarize(vimp_rank=mean(vimp_rank)) %>%
arrange(-vimp_rank) %>%
pull(variable)
tmp$variable = factor(tmp$variable, levels=tmp2)
# Lollipop-style plot: one dodged point + dashed stem per run per variable.
ggplot(tmp) +
aes(x=vimp_rank, y=variable) +
geom_segment(aes(xend=0, group=run), color="grey69", linetype="dashed",
position=position_dodge(width=0.5)) +
geom_point(aes(color=run), position=position_dodge(width=0.5)) +
scale_color_brewer(name="", palette="Dark2") +
xlab("rank( variable importance )") +
ylab("") +
scale_x_continuous(breaks=scales::breaks_pretty()) +
guides(colour=guide_legend(reverse=T, position="bottom")) +
theme_minimal()
}vimp_spearman_plot = function(...) {
# Body of vimp_spearman_plot(...): heatmap of pairwise Spearman correlations
# between the variable-importance profiles of several runs passed via `...`.
dots = list(...)
# Default run labels: the deparsed caller expressions.
if(is.null(names(dots))) {
names(dots) = unlist(lapply(substitute(list(...))[-1], deparse))
}
# One importance column per run, named after the run.
vimps = lapply(names(dots), function(name) {
x = dots[[name]]
tmp = importance(x) %>%
tibble::rownames_to_column('variable') %>%
select(!important)
colnames(tmp)[colnames(tmp) == 'overall'] = name
tmp
})
# inner_join restricts to variables present in every run.
tmp = reduce(vimps, inner_join, by='variable') %>%
tibble::column_to_rownames('variable')
tmp = cor(tmp, method='spearman')
# NOTE: inside this pipeline `tmp` still refers to the correlation matrix
# (assignment happens after evaluation), so colnames(tmp) gives run names.
tmp = as.data.frame(tmp) %>%
tibble::rownames_to_column('var1') %>%
pivot_longer(-var1, names_to='var2', values_to='value') %>%
mutate(var1 = factor(var1, levels=colnames(tmp)),
var2 = factor(var2, levels=colnames(tmp)))
ggplot(tmp) +
aes(x=var1, y=var2, fill=value) +
geom_tile() +
geom_text(aes(label=round(value, 2))) +
scale_fill_gradient2(name="Spearman\ncorrelation", midpoint=0, low="blue",
mid="white", high="red", limits=c(-1, 1)) +
xlab("") +
ylab("") +
theme_minimal() +
theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1)) +
coord_fixed()
}summary2df = function(x) {
# Body of summary2df(x): coerce summary(x)'s summaryDefault/table output into a
# plain data.frame (for pretty table rendering), dropping row names.
data.frame(unclass(summary(x)), check.names = FALSE, stringsAsFactors = FALSE) %>%
`rownames<-`( NULL )
}combineDataSets = function(..., seed=42) {
# Body of combineDataSets(..., seed): column-bind several data frames after
# down-sampling each to the row count of the smallest one. Column names are
# prefixed with "<dataset>__" so their origin stays identifiable.
set.seed(seed)
dfs = list(...)
# Default dataset labels: the deparsed caller expressions.
if(is.null(names(dfs))) {
names(dfs) = unlist(lapply(substitute(list(...))[-1], deparse))
}
# Target row count: the smallest data set.
n = min(sapply(dfs, nrow))
bind_cols(lapply(names(dfs), function(name) {
df = dfs[[name]]
colnames(df) = paste(name, colnames(df), sep='__')
if(nrow(df) == n) { return(df) }
# k = df %>% pull(ends_with('Class')) %>% unique() %>% length()
# Plain random down-sample; the commented-out group_by would have made it
# stratified by Class instead.
df %>%# group_by(across(ends_with('Class'))) %>%
slice_sample(n=n)
}))
}| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| Min. :4.30 | Min. :2.00 | Min. :1.00 | Min. :0.1 | setosa :50 |
| 1st Qu.:5.10 | 1st Qu.:2.80 | 1st Qu.:1.60 | 1st Qu.:0.3 | versicolor:50 |
| Median :5.80 | Median :3.00 | Median :4.35 | Median :1.3 | virginica :50 |
| Mean :5.84 | Mean :3.06 | Mean :3.76 | Mean :1.2 | NA |
| 3rd Qu.:6.40 | 3rd Qu.:3.30 | 3rd Qu.:5.10 | 3rd Qu.:1.8 | NA |
| Max. :7.90 | Max. :4.40 | Max. :6.90 | Max. :2.5 | NA |
# Supervised baseline on iris: forward-select predictors for Species with a
# random forest. cache_rds (project/xfun helper — confirm) memoizes the fit.
vr1 = cache_rds({
varReduct(predictor_vars=iris[,1:4],
response_var=factor(iris$Species),
varReduct_method='forwardSelect',
algo='randomForest',
trace=F,
num_jobs=4,
seed=42)
}, rerun=F, clean=T)
# Inspect: chosen iteration, per-iteration evaluation metrics, variable counts.
vr1$best_iter
vr1$eval_vec
vr1$num_vars_vec
importance(vr1)## eval_type
## 2
## $err
## [1] 0.07333 0.03333
##
## $aic
## [1] 2868.74 37.76
##
## $aicc
## [1] 2868.77 37.84
##
## [1] 1 2
| overall | important | |
|---|---|---|
| Petal.Length | 475.32 | TRUE |
| Petal.Width | 460.21 | TRUE |
| Sepal.Length | 154.88 | FALSE |
| Sepal.Width | 69.21 | FALSE |
vr2 = cache_rds({
usVr(iris)
}, rerun=F, clean=T)
vr2$best_iter
vr2$eval_vec
vr2$num_vars_vec
importance(vr2)## eval_type
## 0
## $err
## [1] 0.82000 0.17667 0.08333
##
## $aic
## [1] 461.6 321.8 217.9
##
## $aicc
## [1] 461.6 321.8 218.0
##
## [1] 1 2 3
| overall | important | |
|---|---|---|
| Species | 1014.7 | TRUE |
| Petal.Length | 968.1 | TRUE |
| Petal.Width | 910.3 | TRUE |
| Sepal.Length | 753.2 | TRUE |
| Sepal.Width | 410.5 | TRUE |
vr3 = cache_rds({
usVr(iris[,1:4])
}, rerun=F, clean=T)
vr3$best_iter
vr3$eval_vec
vr3$num_vars_vec
importance(vr3)## eval_type
## 0
## $err
## [1] 0.9967 0.3200 0.1700
##
## $aic
## [1] 639.2 393.5 295.5
##
## $aicc
## [1] 639.2 393.5 295.5
##
## [1] 1 2 3
| overall | important | |
|---|---|---|
| Petal.Length | 969.3 | TRUE |
| Petal.Width | 887.9 | TRUE |
| Sepal.Length | 867.9 | TRUE |
| Sepal.Width | 474.7 | TRUE |
p1 = varimpcomp_plot(supervised=vr1, unsupervised_wtl=vr2, unsupervised_wotl=vr3)
p2 = vimp_spearman_plot(supervised=vr1, unsupervised_wtl=vr2, unsupervised_wotl=vr3)
p1 + p2plot_vr = function(vr) {
# Body of plot_vr(vr): line plot of the three selection metrics (AIC,
# corrected AIC, OOB error) across iterations, each rescaled to [0, 1] by its
# own maximum so they share one axis.
tmp = as.data.frame(vr) %>%
mutate(across(c(AIC, `Corrected AIC`, `Out-of-bag error`),
# Guard against an all-zero column before dividing by its max.
~ if(max(.x) == 0) 0 else .x/max(.x))) %>%
pivot_longer(c(AIC, `Corrected AIC`, `Out-of-bag error`))
ggplot(tmp) +
aes(x=`Number of Variables`, y=value, color=name) +
geom_line(alpha=0.5) +
geom_point(alpha=0.5) +
scale_color_brewer(name="", palette="Dark2") +
scale_x_continuous(breaks=scales::breaks_pretty(6)) +
ylab("Relative value") +
theme_minimal()
}Observations
Petal features), indicating there is some
stochasticity in it.
imp = importance(vr3)
rfs = cache_rds({
lapply(1:nrow(imp), function(i) {
vars = rownames(imp)[1:i]
usrf(iris[,vars,drop=F], ntree=5000)
})
}, rerun=F, clean=T)
y_true = rfs[[3]]$ytmp = data.frame(
pred_real1 = rfs[[1]]$votes[,1],
pred_real2 = rfs[[2]]$votes[,1],
pred_real3 = rfs[[3]]$votes[,1],
pred_real4 = rfs[[4]]$votes[,1],
class = rep(c('real', 'fake'), each=nrow(iris)),
sample = 1:(2*nrow(iris))
) %>% pivot_longer(!c(class, sample))
ggplot(tmp) +
geom_point(aes(x=sample, y=value, color=name, shape=class),
alpha=0.5) +
ylab('Proportion votes for fake class') +
theme_minimal()aic_fixed = sapply(seq_along(rfs), function(i) {
clfAIC(y_true[1:nrow(iris)], y_pred=rfs[[i]]$votes[1:nrow(iris),], k=i)
})
tmp = as.data.frame(vr3) %>%
mutate(across(c(AIC, `Corrected AIC`, `Out-of-bag error`),
~ if(max(.x) == 0) 0 else .x/max(.x))) %>%
pivot_longer(c(AIC, `Corrected AIC`, `Out-of-bag error`))
tmp2 = data.frame(NA, seq_along(rfs), NA, NA, NA, NA, "AIC_observed_only", aic_fixed/max(aic_fixed))
colnames(tmp2) = colnames(tmp)
tmp = bind_rows(tmp, tmp2)
ggplot(tmp) +
aes(x=`Number of Variables`, y=value, color=name) +
geom_line(alpha=0.5) +
geom_point(alpha=0.5) +
scale_color_brewer(name="", palette="Dark2") +
scale_x_continuous(breaks=scales::breaks_pretty(6)) +
ylim(0, 1) +
ylab("Relative value") +
theme_minimal()data("BreastCancer", package = "mlbench")
BreastCancer = BreastCancer %>%
select(!Id) %>%
mutate(across(!Class, as.numeric)) %>%
filter(!is.na(Bare.nuclei))
summary2df(BreastCancer)| Cl.thickness | Cell.size | Cell.shape | Marg.adhesion | Epith.c.size | Bare.nuclei | Bl.cromatin | Normal.nucleoli | Mitoses | Class |
|---|---|---|---|---|---|---|---|---|---|
| Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. :1.00 | benign :444 |
| 1st Qu.: 2.00 | 1st Qu.: 1.00 | 1st Qu.: 1.00 | 1st Qu.: 1.00 | 1st Qu.: 2.00 | 1st Qu.: 1.00 | 1st Qu.: 2.00 | 1st Qu.: 1.00 | 1st Qu.:1.00 | malignant:239 |
| Median : 4.00 | Median : 1.00 | Median : 1.00 | Median : 1.00 | Median : 2.00 | Median : 1.00 | Median : 3.00 | Median : 1.00 | Median :1.00 | NA |
| Mean : 4.44 | Mean : 3.15 | Mean : 3.21 | Mean : 2.83 | Mean : 3.23 | Mean : 3.54 | Mean : 3.44 | Mean : 2.87 | Mean :1.58 | NA |
| 3rd Qu.: 6.00 | 3rd Qu.: 5.00 | 3rd Qu.: 5.00 | 3rd Qu.: 4.00 | 3rd Qu.: 4.00 | 3rd Qu.: 6.00 | 3rd Qu.: 5.00 | 3rd Qu.: 4.00 | 3rd Qu.:1.00 | NA |
| Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :9.00 | NA |
vr1 = cache_rds({
varReduct(predictor_vars=BreastCancer %>% select(-Class),
response_var=factor(BreastCancer$Class),
varReduct_method='forwardSelect',
algo='randomForest',
trace=F,
preprocess=F,
seed=42)
}, rerun=F, clean=T)
vr1$best_iter
vr1$eval_vec
vr1$num_vars_vec
importance(vr1)## eval_type
## 0
## $err
## [1] 0.08785 0.04832
##
## $aic
## [1] 38519 3009
##
## $aicc
## [1] 38519 3009
##
## [1] 1 2
| overall | important | |
|---|---|---|
| Bare.nuclei | 387.95 | TRUE |
| Cell.shape | 281.19 | TRUE |
| Cell.size | 277.02 | TRUE |
| Cl.thickness | 275.58 | TRUE |
| Bl.cromatin | 235.84 | TRUE |
| Marg.adhesion | 173.01 | TRUE |
| Normal.nucleoli | 161.45 | TRUE |
| Epith.c.size | 157.22 | TRUE |
| Mitoses | 87.44 | TRUE |
vr2 = cache_rds({
usVr(BreastCancer)
}, rerun=F, clean=T)
vr2$best_iter
vr2$eval_vec
vr2$num_vars_vec
importance(vr2)## eval_type
## 3
## $err
## [1] 0.67862 0.32138 0.20791 0.15886 0.14275 0.11493 0.09810 0.08638
##
## $aic
## [1] 2056 1857 1534 115519 49126 44587 26117 14783
##
## $aicc
## [1] 2056 1857 1534 115519 49126 44587 26117 14784
##
## [1] 1 2 3 4 5 6 7 8
| overall | important | |
|---|---|---|
| Cell.size | 1411.0 | TRUE |
| Class | 1400.6 | TRUE |
| Cell.shape | 1390.7 | TRUE |
| Bare.nuclei | 1062.7 | FALSE |
| Cl.thickness | 1043.7 | FALSE |
| Normal.nucleoli | 1024.8 | FALSE |
| Bl.cromatin | 980.7 | FALSE |
| Marg.adhesion | 905.7 | FALSE |
| Epith.c.size | 747.3 | FALSE |
| Mitoses | 384.5 | FALSE |
vr3 = cache_rds({
usVr(BreastCancer[,1:(ncol(BreastCancer)-1)])
}, rerun=F, clean=T)
vr3$best_iter
vr3$eval_vec
vr3$num_vars_vec
importance(vr3)## eval_type
## 3
## $err
## [1] 0.7160 0.3551 0.2328 0.1977 0.1662 0.1362 0.1186 0.1025
##
## $aic
## [1] 2053 2125 1640 105788 66036 56003 29147 19166
##
## $aicc
## [1] 2053 2125 1640 105788 66036 56003 29147 19166
##
## [1] 1 2 3 4 5 6 7 8
| overall | important | |
|---|---|---|
| Cell.shape | 1530.8 | TRUE |
| Bare.nuclei | 1318.0 | TRUE |
| Cell.size | 1217.0 | TRUE |
| Normal.nucleoli | 1161.0 | FALSE |
| Marg.adhesion | 1123.7 | FALSE |
| Bl.cromatin | 1084.2 | FALSE |
| Cl.thickness | 1061.4 | FALSE |
| Epith.c.size | 1044.5 | FALSE |
| Mitoses | 535.2 | FALSE |
p1 = varimpcomp_plot(supervised=vr1, unsupervised_wtl=vr2, unsupervised_wotl=vr3)
p2 = vimp_spearman_plot(supervised=vr1, unsupervised_wtl=vr2, unsupervised_wotl=vr3)
p1 + p2imp = importance(vr2)
rfs = cache_rds({
lapply(1:nrow(imp), function(i) {
vars = rownames(imp)[1:i]
usrf(BreastCancer[,vars,drop=F], ntree=5000)
})
}, rerun=F, clean=T)
y_true = rfs[[3]]$ytmp = data.frame(
pred_real3 = rfs[[3]]$votes[,1],
pred_real4 = rfs[[4]]$votes[,1],
class = rep(c('real', 'fake'), each=nrow(BreastCancer)),
sample = 1:(2*nrow(BreastCancer))
) %>% pivot_longer(!c(class, sample))
ggplot(tmp) +
geom_point(aes(x=sample, y=value, color=name, shape=class),
alpha=0.5) +
ylab('Proportion votes for fake class') +
theme_minimal()tmp = data.frame(
pred_real1 = rfs[[1]]$votes[,1],
pred_real2 = rfs[[2]]$votes[,1],
pred_real3 = rfs[[3]]$votes[,1],
pred_real4 = rfs[[4]]$votes[,1],
class = rep(c('real', 'fake'), each=nrow(BreastCancer)),
sample = 1:(2*nrow(BreastCancer))
) %>% pivot_longer(!c(class, sample))
ggplot(tmp) +
geom_point(aes(x=sample, y=value, color=name, shape=class),
alpha=0.5) +
ylab('Proportion votes for fake class') +
theme_minimal()aic_fixed = sapply(seq_along(rfs), function(i) {
clfAIC(y_true[1:nrow(BreastCancer)], y_pred=rfs[[i]]$votes[1:nrow(BreastCancer),], k=i)
})
tmp = as.data.frame(vr2) %>%
mutate(across(c(AIC, `Corrected AIC`, `Out-of-bag error`),
~ if(max(.x) == 0) 0 else .x/max(.x))) %>%
pivot_longer(c(AIC, `Corrected AIC`, `Out-of-bag error`))
tmp2 = data.frame(NA, seq_along(rfs), NA, NA, NA, NA, "AIC_observed_only", aic_fixed/max(aic_fixed))
colnames(tmp2) = colnames(tmp)
tmp = bind_rows(tmp, tmp2)
ggplot(tmp) +
aes(x=`Number of Variables`, y=value, color=name) +
geom_line(alpha=0.5) +
geom_point(alpha=0.5) +
scale_color_brewer(name="", palette="Dark2") +
scale_x_continuous(breaks=scales::breaks_pretty(6)) +
ylab("Relative value") +
ylim(0, 1) +
theme_minimal()Here I’m going to sweep over the classification data sets available
in the mlbench package and report the results in a
table.
I standardize all of the data sets such that the class label is
assigned to the column Class.
#' Load a package dataset by name and return it as a value.
#'
#' Wraps utils::data() so the dataset is loaded into a throwaway environment
#' instead of the caller's workspace, then returns the object itself.
#'
#' @param ... Passed through to data() (dataset name, and optionally package=).
#' @return The loaded dataset object.
getdata <- function(...) {
  sandbox <- new.env()
  loaded <- data(..., envir = sandbox)[1]
  get(loaded, envir = sandbox)
}
# Assemble the mlbench benchmark data sets. Commented-out entries were
# excluded (too large or otherwise unsuitable — see surrounding text).
data_sets = list(
iris=iris,
BreastCancer=BreastCancer,
DNA=getdata("DNA"),
Glass=getdata("Glass"),
HouseVotes84=getdata("HouseVotes84"),
Ionosphere=getdata("Ionosphere"),
# LetterRecognition=getdata("LetterRecognition"),
PimaIndiansDiabetes=getdata("PimaIndiansDiabetes"),
Satellite=getdata("Satellite"),
# Servo=getdata("Servo"),
# Shuttle=getdata("Shuttle"),
Sonar=getdata("Sonar"),
# Soybean=getdata("Soybean"),
Vehicle=getdata("Vehicle"),
# Vowel=getdata("Vowel"),
Zoo=getdata("Zoo")
)
# Standardize: rename each data set's label column to 'Class' and drop rows
# with missing values.
data_sets = lapply(data_sets, function(x) {
# Known label-column names across the mlbench data sets above.
rn = c('lettr', 'Type', 'diabetes', 'type', 'classes', 'Species')
if(any(colnames(x) %in% rn)) {
colnames(x)[colnames(x) %in% rn] = 'Class'
}
x[complete.cases(x),]
})supervised_rfs = cache_rds({
tmp = lapply(data_sets, function(x) {
ic(dim(x))
y = x$Class
rf = randomForest::randomForest(x = x %>% select(-Class),
y = factor(y),
sampsize=rep(min(table(y)), length(table(y))),
strata=factor(y), importance=T, keep.forest=F, ntree=10000)
rf
})
names(tmp) = names(data_sets)
tmp
}, rerun=F, clean=T)
# Summary table per data set: size, class count, majority-class ("base")
# accuracy, and the supervised RF's final out-of-bag accuracy.
ds_info = bind_rows(lapply(names(data_sets), function(n) {
x = data_sets[[n]]
y = x$Class
rf = supervised_rfs[[n]]
data.frame(name=n,
num_obs=nrow(x),
num_pred_vars=ncol(x) - 1,
num_classes=length(table(y)),
# Accuracy of always predicting the most frequent class.
base_accuracy=round(max(table(y)/nrow(x)), digits=3),
# OOB error at the last tree (forests were grown with ntree=10000).
rf_accuracy=round(1 - rf$err.rate[10000, 1], digits=3)
)
}))
rownames(ds_info) = NULL
#supervised_rfs[[1]]$err.rate[10000,]
reactable(ds_info)unsupervised_rfs1 = cache_rds({
tmp = lapply(names(data_sets), function(n) {
ic(n)
usVr(data_sets[[n]] %>% select(-Class))
})
names(tmp) = names(data_sets)
tmp
}, rerun=F, clean=T)I have 11 data sets here, most of which yield a random forest classification accuracy > 0.9. Only 4 data sets have an rf accuracy < 0.9, with the lowest being the Glass data set with an out-of-bag accuracy of 0.66.
One thing I am curious about is what happens with a diluted clustering signal. One way of testing this is by adding features with little correlation to the class labels or an existing feature.
#' Append n weakly-correlated numeric noise columns to a data frame.
#'
#' @param df Data frame to extend.
#' @param n Number of noise columns to add.
#' @param feat Numeric anchor vector the noise is correlated with (default:
#'   first column of df).
#' @param r Target Pearson correlation between each noise column and feat.
#' @param prefix Column-name prefix for the new columns.
#' @param seed RNG seed for reproducibility.
#' @return df with n extra columns named prefix1..prefixN.
add_noise_numeric = function(df, n=10, feat=df[,1], r=0.05, prefix="noise.", seed=42) {
set.seed(seed)
# BUG FIX: lapply(1:n, faux::rnorm_pre, x=feat, r=r) passed the index i
# positionally into rnorm_pre's second parameter `mu` (x and r being matched
# by name), so noise column i was generated with mean i. Wrap in a function
# so the index is discarded and every column uses the default mean.
tmp = do.call('data.frame', lapply(seq_len(n), function(i) faux::rnorm_pre(x=feat, r=r)))
colnames(tmp) = paste0(prefix, seq_len(n))
cbind(df, tmp)
}
# Categorical counterpart of add_noise_numeric — unimplemented stub; intended
# to add n categorical noise columns weakly associated with feat.
add_noise_categorical = function(df, n=10, feat=df[,1], r=0.05, prefix="noise.", seed=42) {
set.seed(seed)
#TODO
}Another way a signal might be diluted is if the data supports multiple sets of clusters. We can simulate this situation by combining two data sets column-wise, randomizing the class labels in one of them such that each sample belongs to two classes, one for each data set (e.g., sample 1 is simultaneously a versicolor and a breast cancer sample).
An example of what a combined data set might look like:
| iris__Sepal.Length | iris__Sepal.Width | iris__Petal.Length | iris__Petal.Width | iris__Species | BreastCancer__Cl.thickness | BreastCancer__Cell.size | BreastCancer__Cell.shape | BreastCancer__Marg.adhesion | BreastCancer__Epith.c.size | BreastCancer__Bare.nuclei | BreastCancer__Bl.cromatin | BreastCancer__Normal.nucleoli | BreastCancer__Mitoses | BreastCancer__Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. :4.30 | Min. :2.00 | Min. :1.00 | Min. :0.1 | setosa :50 | Min. : 1.00 | Min. : 1.00 | Min. : 1.00 | Min. : 1.0 | Min. : 1.00 | Min. : 1.00 | Min. : 1.0 | Min. : 1.00 | Min. :1.00 | benign :105 |
| 1st Qu.:5.10 | 1st Qu.:2.80 | 1st Qu.:1.60 | 1st Qu.:0.3 | versicolor:50 | 1st Qu.: 1.00 | 1st Qu.: 1.00 | 1st Qu.: 1.00 | 1st Qu.: 1.0 | 1st Qu.: 2.00 | 1st Qu.: 1.00 | 1st Qu.: 2.0 | 1st Qu.: 1.00 | 1st Qu.:1.00 | malignant: 45 |
| Median :5.80 | Median :3.00 | Median :4.35 | Median :1.3 | virginica :50 | Median : 3.00 | Median : 1.00 | Median : 1.00 | Median : 1.0 | Median : 2.00 | Median : 1.00 | Median : 3.0 | Median : 1.00 | Median :1.00 | NA |
| Mean :5.84 | Mean :3.06 | Mean :3.76 | Mean :1.2 | NA | Mean : 4.15 | Mean : 2.86 | Mean : 2.88 | Mean : 2.5 | Mean : 3.14 | Mean : 3.17 | Mean : 3.2 | Mean : 2.43 | Mean :1.52 | NA |
| 3rd Qu.:6.40 | 3rd Qu.:3.30 | 3rd Qu.:5.10 | 3rd Qu.:1.8 | NA | 3rd Qu.: 5.00 | 3rd Qu.: 4.00 | 3rd Qu.: 4.00 | 3rd Qu.: 3.0 | 3rd Qu.: 3.00 | 3rd Qu.: 5.00 | 3rd Qu.: 3.0 | 3rd Qu.: 2.00 | 3rd Qu.:1.00 | NA |
| Max. :7.90 | Max. :4.40 | Max. :6.90 | Max. :2.5 | NA | Max. :10.00 | Max. :10.00 | Max. :10.00 | Max. :10.0 | Max. :10.00 | Max. :10.00 | Max. :10.0 | Max. :10.00 | Max. :9.00 | NA |
Ignore the “Observed” and “Predicted” labels as these are reused from my confusion matrix code.
The table above shows that we now have 39 benign-setosa samples, 31 benign-versicolor samples, 11 malignant-setosa samples, etc.
I do not enforce that the class labels across the two data sets are perfectly randomized, so there are some slight imbalances in the combined classes, but in general, they are quite randomized.
irisbc_rf = cache_rds({
x = test1
set.seed(42)
x_fake = x %>% mutate(across(everything(), ~ sample(.x, size=nrow(x))))
rf = randomForest::randomForest(x=rbind(x, x_fake),
y =factor(c(rep('real', nrow(x)), rep('fake', nrow(x_fake)))),
importance=T,
keep.forest=T,
ntree=10000)
rf
}, rerun=F, clean=T)as.data.frame(importance(irisbc_rf)[,3, drop=F]) %>%
rename(overall=MeanDecreaseAccuracy) %>%
tibble::rownames_to_column('variable') %>%
arrange(-overall)| variable | overall |
|---|---|
| BreastCancer__Class | 122.55 |
| BreastCancer__Cell.shape | 116.04 |
| BreastCancer__Cl.thickness | 97.75 |
| BreastCancer__Cell.size | 96.68 |
| BreastCancer__Bare.nuclei | 94.68 |
| iris__Petal.Length | 94.37 |
| BreastCancer__Normal.nucleoli | 93.74 |
| BreastCancer__Epith.c.size | 91.48 |
| iris__Petal.Width | 91.45 |
| iris__Sepal.Length | 89.21 |
| iris__Species | 89.14 |
| BreastCancer__Marg.adhesion | 85.20 |
| BreastCancer__Bl.cromatin | 63.53 |
| BreastCancer__Mitoses | 51.39 |
| iris__Sepal.Width | 38.68 |
We can create an algorithm that traverses the forest and identifies co-occurrence of features. I suspect that this may distinguish the features that support one set of cluster assignments, and the features that support a second set of cluster assignments.
#' Count the times in a forest that two variables are chosen for adjacent nodes
#' Count the times in a forest that two variables are chosen for adjacent
#' (parent/child) nodes, normalized by how often each variable is used.
#'
#' @param rf A randomForest fit with the forest retained (reads rf$forest).
#' @param ncores Cores for parallel::mclapply over trees.
#' @return d x d matrix: symmetrized adjacency counts over all trees, with
#'   entry (i, j) divided by f_i * f_j, where f_k is the total number of nodes
#'   split on variable k across the forest.
traverse1 = function(rf, ncores=1) {
forest = rf$forest
d = nrow(importance(rf)) # number of variables
adj = Reduce('+', mclapply(seq_len(rf$ntree), function(k) {
ar = matrix(0, nrow=d, ncol=d)
bestvar = forest$bestvar[,k] # split variable per node; 0 = terminal node
ld = forest$treemap[,1,k] # left-daughter node index
rd = forest$treemap[,2,k] # right-daughter node index
for(i in seq_along(bestvar)) {
if(bestvar[i] == 0) next # terminal node: no split, no daughters
if(bestvar[ld[i]] > 0)
ar[bestvar[i], bestvar[ld[i]]] = ar[bestvar[i], bestvar[ld[i]]] + 1
if(bestvar[rd[i]] > 0)
ar[bestvar[i], bestvar[rd[i]]] = ar[bestvar[i], bestvar[rd[i]]] + 1
}
ar
}, mc.cores=ncores))
colnames(adj) = rownames(adj) = rownames(importance(rf))
adj = adj + t(adj) # symmetrize parent->child counts
# Per-variable split frequency over the whole forest; variables never split
# on get 1 to avoid 0/0 (their rows/columns are all zero anyway).
fact = table(rf$forest$bestvar)
fact = as.numeric(fact[as.character(seq_len(nrow(adj)))])
fact[is.na(fact)] = 1
t(adj / fact) / fact # scales entry (i, j) by 1 / (fact_i * fact_j)
}
#head(randomForest::getTree(irisbc_rf, 1))#' Count the times in a forest that two variables are chosen within depth $d$
# NOTE(review): the `depth` parameter is declared but never used, and this body
# is byte-for-byte identical to traverse1 — the "within depth d" counting that
# the roxygen comment above promises is not implemented yet (TODO).
traverse2 = function(rf, depth=3, ncores=1) {
forest = rf$forest
d = nrow(importance(rf)) # number of variables
adj = Reduce('+', mclapply(1:rf$ntree, function(k) {
ar = matrix(0, nrow=d, ncol=d)
bestvar = forest$bestvar[,k] # split variable per node; 0 = terminal node
ld = forest$treemap[,1,k] # left-daughter node index
rd = forest$treemap[,2,k] # right-daughter node index
for(i in 1L:length(bestvar)) {
if(bestvar[i] == 0) next
if(bestvar[ld[i]] > 0)
ar[bestvar[i], bestvar[ld[i]]] = ar[bestvar[i], bestvar[ld[i]]] + 1
if(bestvar[rd[i]] > 0)
ar[bestvar[i], bestvar[rd[i]]] = ar[bestvar[i], bestvar[rd[i]]] + 1
}
ar
}, mc.cores=ncores))
colnames(adj) = rownames(adj) = rownames(importance(rf))
adj = adj + t(adj)
# Normalize by per-variable split frequency, as in traverse1.
fact = table(rf$forest$bestvar)
fact = as.numeric(fact[as.character(1:nrow(adj))])
fact[is.na(fact)] = 1
t(adj / fact) / fact
}adj = traverse1(irisbc_rf, ncores=1)
tmp = as.data.frame(adj) %>%
tibble::rownames_to_column('var1') %>%
pivot_longer(-var1, names_to='var2', values_to='value') %>%
mutate(var1 = factor(var1, levels=colnames(adj)),
var2 = factor(var2, levels=colnames(adj)))
p1 = ggplot(tmp) +
aes(x=var1, y=var2, fill=value) +
geom_tile() +
# geom_text(aes(label=round(value, 2))) +
scale_fill_gradient(name="Feature Pairwise\nCo-occurence", low="white", high="red") +
xlab("") +
ylab("") +
theme_minimal() +
theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1)) +
coord_fixed()
tmp2 = adj
tmp2[lower.tri(tmp2, diag=T)] = NA
tmp = as.data.frame(tmp2) %>%
tibble::rownames_to_column('var1') %>%
pivot_longer(-var1, names_to='var2', values_to='value') %>%
mutate(var1 = factor(var1, levels=colnames(adj)),
var2 = factor(var2, levels=colnames(adj))) %>%
filter(!is.na(value))
p2 = ggplot(tmp) +
geom_histogram(aes(x=value), bins=20) +
xlab('Pairwise Co-occurence') +
ylab('Count') +
theme_minimal()
p1Ignore the diagonals on the heatmap, as these capture edge cases where sometimes the forest splits on the same variable consecutively.
The exact formula for pairwise co-occurrence used here is:
\[ \begin{equation} \textrm{Co-occurrence}_{i, j} = \frac{\sum_{t \in T} \sum_{n \in N_{t}} I(i \in E(n))I(j \in E(n))} {f_{i}f_{j}} , \end{equation} \]
where \(i\) and \(j\) are features, \(f_{i}\) is the count of feature label \(i\) over all nodes in a forest, \(T\) is the set of trees in a forest, \(N\) is the set of nodes in a tree, \(I\) is the indicator function, and \(E\) is the set of edges for node \(n\).
| var1 | var2 | value |
|---|---|---|
| BreastCancer__Cell.size | BreastCancer__Class | 6.163e-06 |
| BreastCancer__Normal.nucleoli | BreastCancer__Class | 5.831e-06 |
| BreastCancer__Bare.nuclei | BreastCancer__Class | 5.672e-06 |
| BreastCancer__Cell.shape | BreastCancer__Class | 5.662e-06 |
| iris__Petal.Width | iris__Species | 5.563e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Class | 5.352e-06 |
| iris__Petal.Length | iris__Species | 5.171e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Class | 5.117e-06 |
| BreastCancer__Cell.size | BreastCancer__Marg.adhesion | 5.026e-06 |
| BreastCancer__Cell.size | BreastCancer__Epith.c.size | 4.996e-06 |
| BreastCancer__Cell.size | BreastCancer__Normal.nucleoli | 4.984e-06 |
| BreastCancer__Cell.shape | BreastCancer__Bare.nuclei | 4.974e-06 |
| BreastCancer__Cell.shape | BreastCancer__Normal.nucleoli | 4.963e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Bare.nuclei | 4.910e-06 |
| iris__Sepal.Width | iris__Species | 4.880e-06 |
| BreastCancer__Bare.nuclei | BreastCancer__Normal.nucleoli | 4.850e-06 |
| BreastCancer__Mitoses | BreastCancer__Class | 4.847e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Epith.c.size | 4.837e-06 |
| BreastCancer__Epith.c.size | BreastCancer__Class | 4.818e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Normal.nucleoli | 4.800e-06 |
| BreastCancer__Cell.size | BreastCancer__Mitoses | 4.729e-06 |
| BreastCancer__Epith.c.size | BreastCancer__Bare.nuclei | 4.721e-06 |
| iris__Sepal.Length | iris__Petal.Width | 4.703e-06 |
| BreastCancer__Cell.shape | BreastCancer__Epith.c.size | 4.684e-06 |
| iris__Sepal.Length | iris__Petal.Length | 4.681e-06 |
| BreastCancer__Cell.shape | BreastCancer__Marg.adhesion | 4.668e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Bl.cromatin | 4.629e-06 |
| iris__Sepal.Length | iris__Species | 4.613e-06 |
| BreastCancer__Cell.shape | BreastCancer__Mitoses | 4.582e-06 |
| BreastCancer__Bare.nuclei | BreastCancer__Bl.cromatin | 4.577e-06 |
| BreastCancer__Epith.c.size | BreastCancer__Normal.nucleoli | 4.576e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Mitoses | 4.564e-06 |
| BreastCancer__Cell.size | BreastCancer__Cell.shape | 4.558e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Cell.shape | 4.546e-06 |
| iris__Sepal.Width | iris__Petal.Width | 4.430e-06 |
| iris__Petal.Length | iris__Petal.Width | 4.418e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Bl.cromatin | 4.403e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Cell.size | 4.400e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Epith.c.size | 4.395e-06 |
| BreastCancer__Epith.c.size | BreastCancer__Bl.cromatin | 4.388e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Bare.nuclei | 4.383e-06 |
| BreastCancer__Bare.nuclei | BreastCancer__Mitoses | 4.375e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Normal.nucleoli | 4.350e-06 |
| BreastCancer__Epith.c.size | BreastCancer__Mitoses | 4.336e-06 |
| BreastCancer__Marg.adhesion | BreastCancer__Mitoses | 4.324e-06 |
| BreastCancer__Bl.cromatin | BreastCancer__Normal.nucleoli | 4.318e-06 |
| iris__Sepal.Width | iris__Petal.Length | 4.272e-06 |
| BreastCancer__Bl.cromatin | BreastCancer__Class | 4.252e-06 |
| BreastCancer__Cell.size | BreastCancer__Bare.nuclei | 4.251e-06 |
| BreastCancer__Normal.nucleoli | BreastCancer__Mitoses | 4.248e-06 |
| iris__Sepal.Length | iris__Sepal.Width | 4.161e-06 |
| BreastCancer__Cell.shape | BreastCancer__Bl.cromatin | 4.122e-06 |
| BreastCancer__Cell.size | BreastCancer__Bl.cromatin | 4.114e-06 |
| BreastCancer__Cl.thickness | BreastCancer__Marg.adhesion | 4.103e-06 |
| BreastCancer__Bl.cromatin | BreastCancer__Mitoses | 4.062e-06 |
| iris__Sepal.Width | BreastCancer__Epith.c.size | 4.011e-06 |
| iris__Sepal.Width | BreastCancer__Cl.thickness | 3.979e-06 |
| iris__Species | BreastCancer__Bl.cromatin | 3.883e-06 |
| iris__Sepal.Width | BreastCancer__Bl.cromatin | 3.882e-06 |
| iris__Sepal.Width | BreastCancer__Marg.adhesion | 3.853e-06 |
| iris__Species | BreastCancer__Mitoses | 3.779e-06 |
| iris__Sepal.Length | BreastCancer__Bl.cromatin | 3.773e-06 |
| iris__Sepal.Length | BreastCancer__Cl.thickness | 3.770e-06 |
| iris__Petal.Width | BreastCancer__Bl.cromatin | 3.768e-06 |
| iris__Species | BreastCancer__Cl.thickness | 3.766e-06 |
| iris__Species | BreastCancer__Epith.c.size | 3.699e-06 |
| iris__Sepal.Length | BreastCancer__Epith.c.size | 3.678e-06 |
| iris__Sepal.Length | BreastCancer__Marg.adhesion | 3.659e-06 |
| iris__Petal.Length | BreastCancer__Epith.c.size | 3.653e-06 |
| iris__Petal.Length | BreastCancer__Bl.cromatin | 3.611e-06 |
| iris__Petal.Width | BreastCancer__Epith.c.size | 3.607e-06 |
| iris__Petal.Width | BreastCancer__Cl.thickness | 3.592e-06 |
| iris__Petal.Length | BreastCancer__Cl.thickness | 3.586e-06 |
| iris__Species | BreastCancer__Cell.shape | 3.575e-06 |
| iris__Species | BreastCancer__Marg.adhesion | 3.557e-06 |
| iris__Sepal.Width | BreastCancer__Bare.nuclei | 3.556e-06 |
| iris__Petal.Length | BreastCancer__Marg.adhesion | 3.549e-06 |
| iris__Petal.Length | BreastCancer__Normal.nucleoli | 3.539e-06 |
| iris__Petal.Length | BreastCancer__Bare.nuclei | 3.511e-06 |
| iris__Sepal.Length | BreastCancer__Mitoses | 3.497e-06 |
| iris__Sepal.Length | BreastCancer__Normal.nucleoli | 3.494e-06 |
| iris__Species | BreastCancer__Bare.nuclei | 3.486e-06 |
| iris__Sepal.Width | BreastCancer__Normal.nucleoli | 3.478e-06 |
| iris__Sepal.Length | BreastCancer__Bare.nuclei | 3.466e-06 |
| iris__Species | BreastCancer__Cell.size | 3.460e-06 |
| iris__Sepal.Width | BreastCancer__Cell.size | 3.433e-06 |
| iris__Sepal.Width | BreastCancer__Mitoses | 3.420e-06 |
| iris__Sepal.Width | BreastCancer__Cell.shape | 3.418e-06 |
| iris__Sepal.Length | BreastCancer__Cell.shape | 3.413e-06 |
| iris__Petal.Width | BreastCancer__Normal.nucleoli | 3.406e-06 |
| iris__Petal.Length | BreastCancer__Cell.size | 3.396e-06 |
| iris__Sepal.Length | BreastCancer__Cell.size | 3.363e-06 |
| iris__Petal.Width | BreastCancer__Bare.nuclei | 3.355e-06 |
| iris__Petal.Width | BreastCancer__Cell.size | 3.352e-06 |
| iris__Petal.Width | BreastCancer__Marg.adhesion | 3.346e-06 |
| iris__Petal.Length | BreastCancer__Mitoses | 3.331e-06 |
| iris__Species | BreastCancer__Normal.nucleoli | 3.317e-06 |
| iris__Petal.Width | BreastCancer__Mitoses | 3.271e-06 |
| iris__Petal.Length | BreastCancer__Cell.shape | 3.232e-06 |
| iris__Petal.Width | BreastCancer__Cell.shape | 3.219e-06 |
| iris__Sepal.Width | BreastCancer__Class | 3.154e-06 |
| iris__Species | BreastCancer__Class | 3.127e-06 |
| iris__Sepal.Length | BreastCancer__Class | 3.084e-06 |
| iris__Petal.Length | BreastCancer__Class | 3.060e-06 |
| iris__Petal.Width | BreastCancer__Class | 2.946e-06 |
Observations
We can define a distance measure based on the co-occurrence between features with:
\[ \begin{equation} \textrm{Distance}_{i, j} = 1 - \frac{\textrm{Co-occurrence}_{i, j} }{\max (\textrm{Co-occurrence})} . \end{equation} \]
Using hierarchical clustering, we can show that the largest distance is between the two data sets.
ibc.dist = as.dist(1 - adj/max(adj))
#ibc.dist2 = as.dist(-1 * log(adj))
# this second one does not work very well at all, the distances at the leaves are way longer than
# any similarity
plot(hclust(ibc.dist, method = "ward.D2"), hang=-1)## iris__Sepal.Length iris__Sepal.Width
## 1 1
## iris__Petal.Length iris__Petal.Width
## 1 1
## iris__Species BreastCancer__Cl.thickness
## 1 2
## BreastCancer__Cell.size BreastCancer__Cell.shape
## 2 2
## BreastCancer__Marg.adhesion BreastCancer__Epith.c.size
## 2 2
## BreastCancer__Bare.nuclei BreastCancer__Bl.cromatin
## 2 2
## BreastCancer__Normal.nucleoli BreastCancer__Mitoses
## 2 2
## BreastCancer__Class
## 2
Likewise, optics can identify clusters (along with features that might be removed).
#round(sqrt(nrow(adj)))
opt = optics(ibc.dist, minPts=3) # I think minPts is set somewhat arbitrarily - i.e., however many
# features you think should support a given clustering minus 2; in this case, because iris only has
# 5 features (including true class labels, it's detected best when using minPts <= 3
ibc.umap = umap::umap(as.matrix(ibc.dist), input="dist")
tmp = as.data.frame(ibc.umap$layout) %>%
tibble::rownames_to_column("feature")
opt = extractXi(opt, xi=0.05, minimum=T)
plot(opt)tmp$cluster = factor(ifelse(opt$cluster == 0, "unclustered", as.character(opt$cluster)))
ggplot(tmp) +
aes(x=V1, y=V2) +
geom_point(aes(color=cluster), size=3) +
ggrepel::geom_text_repel(aes(label=feature)) +
ggtitle("OPTICS clustering with UMAP mapping") +
xlab('umap 1') +
ylab('umap 2') +
theme_minimal()I think there are several ways within cluster feature importance might be described. Two of which I’ve described here:
imp = data.frame(importance(irisbc_rf)) %>%
arrange(desc(MeanDecreaseAccuracy))
rfs = cache_rds({
lapply(1:(nrow(imp)-1), function(i) {
vars = rownames(imp)[1:i]
usrf(test1[,vars,drop=F], ntree=10000)
})
})
rfs = c(rfs, list(irisbc_rf))y_true = rfs[[1]]$y
tmp = bind_rows(lapply(seq_along(rfs), function(i) {
aic_obs = clfAIC(y_true[1:(length(y_true)/2)],
y_pred=rfs[[i]]$votes[1:(length(y_true)/2),], k=i)
aic = clfAIC(y_true, y_pred=rfs[[i]]$votes, k=i)
err = rfs[[i]]$err.rate[10000, 1]
vars = nrow(importance(rfs[[i]]))
data.frame(AIC=aic, AIC_obs=aic_obs, error=err, nvars=vars)
})) %>%
mutate(across(!nvars, ~ .x / max(.x))) %>%
pivot_longer(!c(nvars)) %>%
group_by(name) %>%
mutate(is.min = value == min(value))
ggplot(tmp) +
aes(x=nvars, y=value, color=name) +
geom_line(alpha=0.5) +
geom_point(aes(shape=is.min), alpha=0.5) +
scale_color_brewer(name="", palette="Dark2") +
scale_x_continuous(name="Number of variables", breaks=scales::breaks_pretty(6),
sec.axis = sec_axis(~ . * 1, breaks=1:nrow(imp), labels=rownames(imp))) +
ylim(0, 1) +
ylab("Relative value") +
theme_minimal() +
theme(axis.text.x.top = element_text(angle=60, vjust=0, hjust=0))clus = as.data.frame(ibc.umap$layout)
clus$cluster = opt$cluster
#clus
add_rfs = cache_rds({
lapply(1:2, function(i) {
vars = rownames(clus)[clus$cluster == i]
usrf(test1[,vars,drop=F], ntree=10000, seed=41)
})}, rerun=F, clean=T)
tmp2 = bind_rows(lapply(seq_along(add_rfs), function(i) {
rf = add_rfs[[i]]
vars = nrow(importance(rf))
aic_obs = clfAIC(y_true[1:(length(y_true)/2)],
y_pred=rf$votes[1:(length(y_true)/2),], k=vars)
aic = clfAIC(y_true, y_pred=rf$votes, k=vars)
err = rf$err.rate[10000, 1]
data.frame(AIC=aic, AIC_obs=aic_obs, error=err, nvars=vars)
}))
tmp = bind_rows(lapply(seq_along(rfs), function(i) {
vars = nrow(importance(rfs[[i]]))
aic_obs = clfAIC(y_true[1:(length(y_true)/2)],
y_pred=rfs[[i]]$votes[1:(length(y_true)/2),], k=vars)
aic = clfAIC(y_true, y_pred=rfs[[i]]$votes, k=vars)
err = rfs[[i]]$err.rate[10000, 1]
data.frame(AIC=aic, AIC_obs=aic_obs, error=err, nvars=vars)
}))
tmp2$AIC = tmp2$AIC / max(tmp$AIC)
tmp2$AIC_obs = tmp2$AIC_obs / max(tmp$AIC_obs)
tmp2$error = tmp2$error / max(tmp$error)
tmp = tmp %>%
mutate(across(!nvars, ~ .x / max(.x))) %>%
pivot_longer(!c(nvars)) %>%
group_by(name) %>%
mutate(is.min = value == min(value))
tmp2 = tmp2 %>%
pivot_longer(!c(nvars)) %>%
mutate(is.min=F) %>%
filter(value <= 1)
ggplot(tmp) +
geom_line(aes(x=nvars, y=value, color=name), alpha=0.5) +
geom_point(aes(x=nvars, y=value, color=name, shape=is.min), alpha=0.5) +
scale_color_brewer(name="", palette="Dark2") +
scale_x_continuous(name="Number of variables", breaks=scales::breaks_pretty(6),
sec.axis = sec_axis(~ . * 1, breaks=1:nrow(imp), labels=rownames(imp))) +
ylim(0, 1) +
ylab("Relative value") +
theme_minimal() +
geom_point(data=tmp2, mapping=aes(x=nvars, y=value, color=name), shape=3, size=3) +
theme(axis.text.x.top = element_text(angle=60, vjust=0, hjust=0))